#!/bin/bash
# Launcher for machine rank 0 of a two-machine `accelerate launch` training run.
# The shebang must be the very first line of the file; otherwise the kernel
# does not recognise it when the script is executed directly.
set -x  # Trace every command as it is executed
echo "Starting script execution at $(date)"
# export NCCL_IB_DISABLE=0
# export NCCL_IB_HCA=mlx5
# export NCCL_DEBUG=INFO
# export NCCL_SOCKET_IFNAME=eno1,ibp1s0
# export GLOO_SOCKET_IFNAME=eteno10,ibp1s0
# export NCCL_DEBUG_SUBSYS=ALL


# export NCCL_DEBUG=INFO
# export NCCL_IB_DISABLE=0
# export NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_2,mlx5_3
# export NCCL_SOCKET_IFNAME=ibp1s0
# export GLOO_SOCKET_IFNAME=eno1,ibp1s0
# # export NCCL_SOCKET_IFNAME=en,eth,em,bond
# ulimit -l unlimited
# Disable tokenizers parallelism


# --- Communication backend settings -----------------------------------------
# Verbose NCCL logging.
export NCCL_DEBUG=INFO
# NOTE(review): per the NCCL docs, 1 disables the InfiniBand/RoCE transport so
# NCCL uses IP sockets instead — confirm this is still intended for this cluster.
export NCCL_IB_DISABLE=1
# Pin both NCCL and Gloo socket traffic to the eno1 network interface.
export NCCL_SOCKET_IFNAME=eno1
export GLOO_SOCKET_IFNAME=eno1

# --- Allocator / tokenizer settings -----------------------------------------
# Exported once (the original set this identical value twice).
# NOTE(review): presumably the ROCm/HIP counterpart of PYTORCH_CUDA_ALLOC_CONF — confirm.
export PYTORCH_HIP_ALLOC_CONF=expandable_segments:True

# Disable tokenizers parallelism
export TOKENIZERS_PARALLELISM=false
echo "TOKENIZERS_PARALLELISM: $TOKENIZERS_PARALLELISM"
# export NCCL_IB_GID_INDEX=3

# # Conda activation (must be executed on all nodes)
# source /m2v_intern/liujie/miniconda3/etc/profile.d/conda.sh
# conda deactivate
# conda activate /m2v_intern/liujie/miniconda3/envs/flow_grpo

# Project root directory (modify according to actual path)
PROJECT_ROOT="/root/xjh/flow_grpo-main"
# The training script and its config are passed as paths relative to the
# project root, so abort rather than launch from the wrong directory.
cd "$PROJECT_ROOT" || {
    echo "ERROR: cannot change directory to $PROJECT_ROOT" >&2
    exit 1
}

# Rendezvous settings for this machine.
# NOTE(review): RANK is hard-coded to 0, so this script only starts the first
# machine; the second machine presumably needs the same command with rank 1.
MASTER_PORT=19001
RANK=0
MASTER_ADDR=10.8.160.45

# Launch command. Base settings come from scripts/accelerate_configs/multi_node.yaml;
# the machine/process counts, rank and main-process address are passed
# explicitly on the command line.
accelerate launch --config_file "$PROJECT_ROOT/scripts/accelerate_configs/multi_node.yaml" \
    --num_machines 2 --num_processes 8 \
    --machine_rank "${RANK}" --main_process_ip "${MASTER_ADDR}" --main_process_port "${MASTER_PORT}" \
    scripts/train_sd3.py \
    --config config/grpo.py:geneval_sd3

